In [640]:
%matplotlib inline
import math
import time
import locale
from locale import atof
import warnings

import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import xlrd

import sklearn
from sklearn import datasets, decomposition, ensemble, linear_model, metrics, neighbors, preprocessing
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA as sklearn_pca
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, RFECV, SelectFromModel, SelectKBest, f_regression
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_predict,
                                     cross_val_score, train_test_split)
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.utils import resample, shuffle

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from IPython.display import display
#matplotlib.mlab.PCA was removed from matplotlib; sklearn's PCA (imported above) is used instead
#surprise's top-level evaluate() was removed in recent releases, so only SVD is imported
from surprise import SVD
In [641]:
# Read and import data
answers = pd.read_csv('WOW Raw Data HST n468.csv', encoding='latin-1')
answers.head()
Out[641]:
In [642]:
#Analyse types of variables
answers.info()
In [643]:
#Identify all the columns in the file
answers.columns
Out[643]:
In [644]:
#Identify unique values in the Section Area
answers.Section.unique()
Out[644]:
In [645]:
#Drop additional information that we are not using for the model
answers1 = answers.drop(['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
'ExternalReference', 'LocationLatitude', 'LocationLongitude',
'DistributionChannel', 'UserLanguage','Program','Type of Degree','City', 'Country',
'PostCode', 'Age', 'Birth', 'Nationality', 'Degree', 'Title',
'University', 'PrivateEmail', 'Address', 'Identification','University - Topics','Q1'],axis = 1)
In [646]:
#Rename columns using row 0 of the original data set
answers1 = answers1.rename(columns=answers.iloc[0])
#Drop the first three rows, which hold survey header/system information, and reset the index
answers1 = answers1.drop(answers.index[0:3]).reset_index(drop=True)
answers1.head()
Out[646]:
In [647]:
#Identify all the columns in the file
answers1.columns
Out[647]:
Subset of the data to look at scores by program, gender, etc.
In [648]:
answers2 = answers1[['Section','Gender','Catalyst_Score', 'Orderer_Score',
                     'Influencer_Score', 'Benefactor_Score', 'Harmonizer_Score',
                     'Investigator_Score', 'Quantifier_Score', 'Distiller_Score',
                     'Innovator_Score', 'Creator_Score']].copy()
answers2.head()
Out[648]:
In [649]:
#Assign values to Gender: Male = 0, Female = 1 (responses appear in English and Spanish)
answers2['Gender'] = answers2['Gender'].map({'Female': 1, 'Mujer': 1, 'Male': 0, 'Hombre': 0})
#Map sections to programs at a high level (e.g. GMBD & MBD -> MBD).
#Note: EMCC-01 is folded into MCC here; whether MCC and EMCC should stay
#separate is still an open question
section_map = {'MBD-01': 'MBD', 'MBD-02': 'MBD', 'GMBD-01': 'MBD',
               'MCC-01': 'MCC', 'MCC-02': 'MCC', 'EMCC-01': 'MCC',
               'MRCB-01': 'MRCB', 'MRCB-02': 'MRCB',
               'MVDM-01': 'MVDM', 'MVDM-02': 'MVDM',
               'MTDHR-01': 'MTDHR',
               'BIP-2014': 'BIP', 'BIP-2015': 'BIP', 'BIP-2016': 'BIP', 'BIP-2017': 'BIP',
               'BIS-2016': 'BIS', 'BIS-2017': 'BIS',
               'BIC-2014': 'BIC', 'BIC-2015': 'BIC', 'BIC-2016': 'BIC', 'BIC-2017': 'BIC',
               'MCXI-01': 'MCXI', 'MCS-01': 'MCS', 'EXMPLS-01': 'EXMPLS'}
answers2['Section'] = answers2['Section'].map(section_map)
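A small sanity check, not in the original notebook, that the map above covered every raw section code (anything missed by .map() becomes NaN):
In [ ]:
#Added check: count section codes the mapping missed
print(answers2['Section'].isna().sum())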
In [650]:
answers2.head()
Out[650]:
In [651]:
#New names for Programs
answers2.Section.unique()
Out[651]:
In [652]:
answers2.info()
In [653]:
answers2['Gender'].value_counts()
Out[653]:
In [654]:
answers2['Section'].value_counts()
Out[654]:
Relationship between variables - Raw
In [655]:
#Column names below follow the source file, including its inconsistent
#'Distiler'/'Distiller' spelling
ratings = answers1[['Section','Gender','Catalyst-Driving', 'Catalyst-Orchestrating', 'Catalyst-Activating',
       'Orderer-Ordering', 'Orderer-Risk Reducing', 'Orderer-Policing',
       'Influencer-Communicating', 'Influencer-Advocating',
       'Influencer-Selling', 'Benefactor-Defending', 'Benefactor-Empathizing',
       'Benefactor-Developing', 'Harmonizer-Including',
       'Harmonizer-Conflict Reducing', 'Harmonizer-Consensus Building',
       'Investigator-Drilling', 'Investigator-Dissecting',
       'Investigator-Explaining', 'Quantifier-Measuring',
       'Quantifier-Pattern Finding', 'Quantifier-Modeling',
       'Distiler-Packaging', 'Distiler-Simplifying', 'Distiller-Connecting',
       'Innovator-Disrupting', 'Innovator-Brainstorming', 'Innovator-Testing',
       'Creator-Creating', 'Creator-Making', 'Creator-Expressing']].copy()
#Map sections to programs at a high level, reusing section_map from above
#(EMCC-01 again folds into MCC)
ratings['Section'] = ratings['Section'].map(section_map)
In [656]:
#ratings = ratings[ratings.Section != 'BIC']
#ratings = ratings[ratings.Section != 'BIS']
#ratings = ratings[ratings.Section != 'BIP']
ratings1 = ratings.drop(['Gender','Section'], axis = 1)
print(ratings.Section.unique())
ratings1.head()
Out[656]:
In [657]:
from sklearn.preprocessing import scale
#Ensure numeric dtype, then standardize each respondent's answers across the
#questions (mean 0, std 1 per row) so that profiles are comparable
ratings1 = ratings1.apply(pd.to_numeric)
ratings1 = ratings1.apply(lambda V: scale(V, axis=0, with_mean=True, with_std=True, copy=False), axis=1)
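A quick verification sketch, not in the original notebook: after the row-wise scaling, each respondent's ratings should have mean ~0 and standard deviation ~1 (ddof=0 matches sklearn's scale).
In [ ]:
#Added check: row means should be ~0 and population stds ~1
print(ratings1.mean(axis=1).abs().max())
print(ratings1.std(axis=1, ddof=0).head())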
In [658]:
#Correlation Matrix between all answers
corrmat = ratings1.corr(method='spearman')
#plot pivot table as heatmap using seaborn
plt.figure(figsize=(30, 10))
ax = sns.heatmap(corrmat, annot=True)
plt.tight_layout()
plt.show()
In [659]:
ratings2 = ratings['Section']
In [660]:
ratings3 = pd.concat([ratings2, ratings1], axis=1)
ratings3.head()
Out[660]:
In [661]:
ratings3.Section.unique()
Out[661]:
In [662]:
means = ratings3.groupby(['Section']).mean()
In [663]:
#plot pivot table as heatmap using seaborn
plt.figure(figsize=(30, 10))
ax = sns.heatmap(means, annot=True)
plt.show()
In [664]:
stdmeans = pd.DataFrame(means.std())
stdmeans
Out[664]:
In [665]:
#Normalize Gender labels to English: 'Mujer' -> 'Female', 'Hombre' -> 'Male'
ratings['Gender'] = ratings['Gender'].map({'Female': 'Female', 'Mujer': 'Female', 'Male': 'Male', 'Hombre': 'Male'})
ratings5 = ratings['Gender']
ratings4 = pd.concat([ratings5, ratings3], axis=1)
ratings4.head()
Out[665]:
In [666]:
gendermean = ratings4.groupby(['Section','Gender']).mean()
In [667]:
#plot pivot table as heatmap using seaborn
plt.figure(figsize=(30, 10))
ax = sns.heatmap(gendermean, annot=True, square=True)
plt.setp(ax.xaxis.get_majorticklabels(), rotation=90 )
plt.show()
In [668]:
genderstd = ratings4.groupby(['Section','Gender']).std()
genderstd
Out[668]:
In [669]:
#plot pivot table as heatmap using seaborn
plt.figure(figsize=(30, 10))
ax = sns.heatmap(genderstd, annot=True, square=True)
plt.setp(ax.xaxis.get_majorticklabels(), rotation=90 )
plt.show()
In [670]:
ratings3['Section'].unique()
Out[670]:
In [671]:
#Encode programs as integer labels (the codes are arbitrary; 7 is unused in the original)
ratings3['Section'] = ratings3['Section'].map({'MCC': 0, 'MBD': 1, 'MVDM': 2, 'MRCB': 3, 'MTDHR': 4, 'BIP': 5, 'MCXI': 6, 'MCS': 8, 'BIC': 9, 'EXMPLS': 10, 'BIS': 11})
In [672]:
ratings3['Section'].unique()
Out[672]:
In [673]:
#Decision tree: which question gives us the most information?
#Convert the data so that we can run a tree
X1 = ratings3.drop(['Section'],axis=1)
Y1 = ratings3.Section.values
# This is the model we'll be using.
from sklearn import tree
# A convenience for displaying visualizations.
from IPython.display import Image
# Packages for rendering our tree.
import pydotplus
import graphviz
# Initialize and train our tree.
decision_tree = tree.DecisionTreeClassifier(
    criterion='gini',
    max_features=2,
    random_state=10
)
decision_tree.fit(X1, Y1)
# Render the tree; class_names must follow the order of decision_tree.classes_
dot_data = tree.export_graphviz(
    decision_tree, out_file=None,
    feature_names=X1.columns,
    class_names=[str(c) for c in decision_tree.classes_],
    filled=True
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
In [674]:
# Convert data for second feature selection method
X = np.array(ratings3.drop(['Section'],axis=1))
Y = np.array(ratings3['Section'])
In [675]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from itertools import product
In [676]:
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
('svc', clf3)],
voting='soft', weights=[2, 1, 2])
clf1.fit(X, Y)
clf2.fit(X, Y)
clf3.fit(X, Y)
eclf.fit(X, Y)
Out[676]:
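A hedged evaluation sketch, not in the original: cross-validated accuracy for each component and the soft-voting ensemble. cv=5 is an assumption, and programs with very few respondents may need fewer folds.
In [ ]:
#Added sketch: compare cross-validated accuracy (cv=5 is an assumption)
for label, clf in [('dt', clf1), ('knn', clf2), ('svc', clf3), ('ensemble', eclf)]:
    scores = cross_val_score(clf, X, Y, cv=5)
    print('%s: %.3f (+/- %.3f)' % (label, scores.mean(), scores.std()))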
Potential order of questions (based on which features best describe the overall model)
EXMPLS and the Bachelor programs (BIP, BIC, BIS) drop out in the first questions
['MCC', 'MBD', 'MVDM', 'MRCB', 'MTDHR', 'MCXI', 'MCS']
In [677]:
#Filter out the Bachelor programs and EXMPLS step by step
#(ratings8 is reused by the per-program cells below)
ratings6 = ratings4[ratings4.Section != 'BIP']
ratings7 = ratings6[ratings6.Section != 'EXMPLS']
ratings8 = ratings7[ratings7.Section != 'BIC']
ratings9 = ratings8[ratings8.Section != 'BIS']
ratings10 = ratings9.copy()
ratings10.Section.unique()
Out[677]:
In [678]:
#Feature selection: score the most relevant features (should we start with the
#question that has the most explanatory power?)
from sklearn.feature_selection import SelectKBest
#Note: X10 and Y10 are defined in the 'Data prepared for modelling' cell at the
#end of this notebook, so run that cell before this one
# feature extraction (SelectKBest defaults to the ANOVA F-test, f_classif)
test = SelectKBest()
fit = test.fit(X10, Y10)
#Identify the features with the highest scores for the remaining programs
names2 = X10.columns
Bestfeatures = pd.DataFrame(fit.scores_, index=names2)
Bestfeatures.columns = ['Potential Order of Questions']
Bestfeatures.sort_values(by=['Potential Order of Questions'], ascending=False)
Out[678]:
MCC
In [679]:
#One-vs-rest target: MCC = 1, all other programs = 0
ratings10['Section'] = ratings10['Section'].map({'MCC': 1, 'MBD': 0, 'MVDM': 0, 'MRCB': 0, 'MTDHR': 0, 'MCXI': 0, 'MCS': 0})
ratings10.Section.unique()
Out[679]:
In [680]:
#Upsample the minority class
# Separate majority and minority classes
ratings10_majority = ratings10[ratings10.Section==0]
ratings10_minority = ratings10[ratings10.Section==1]
# Upsample minority class to match the majority class size
ratings10_minority_upsampled = resample(ratings10_minority, replace=True, n_samples=len(ratings10_majority), random_state=123)
# Combine majority class with upsampled minority class
ratings10_upsampled = pd.concat([ratings10_majority, ratings10_minority_upsampled])
# Display new class counts
ratings10_upsampled.Section.value_counts()
Out[680]:
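One caution on the upsampling used here and in the blocks below: duplicating minority rows before any train/test split lets copies of the same respondent appear on both sides of a split, inflating scores. A hedged sketch of the safer order (split first, then upsample only the training part; test_size=0.3 is an assumption):
In [ ]:
#Added sketch: split first, then upsample only the training fold
train, test = train_test_split(ratings10, test_size=0.3, random_state=123,
                               stratify=ratings10.Section)
train_minority_up = resample(train[train.Section==1], replace=True,
                             n_samples=(train.Section==0).sum(), random_state=123)
train_upsampled = pd.concat([train[train.Section==0], train_minority_up])
train_upsampled.Section.value_counts()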
In [681]:
#Decision tree: which question gives us the most information?
#Convert the data so that we can run a tree
X2 = ratings10_upsampled.drop(['Section','Gender'],axis=1)
Y2 = ratings10_upsampled.Section.values
In [682]:
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
('svc', clf3)],
voting='soft', weights=[1, 1, 1])
clf4 = GaussianNB()  # GaussianNB is imported at the top; clf4 is defined but not used below
In [683]:
# params would configure a 500-round, depth-2 gradient boosting classifier, but
# that call is commented out; the model actually fitted is clf1, the depth-4
# decision tree defined above
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}
#clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X2, Y2)
feature_importance2 = clf1.feature_importances_
# Make importances relative to max importance.
feature_importance2 = 100.0 * (feature_importance2 / feature_importance2.max())
sorted_idx = np.argsort(feature_importance2)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(7, 30))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance2[sorted_idx], align='center')
plt.yticks(pos, X2.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('MCC')
plt.show()
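The six program blocks that follow repeat this map/upsample/fit/plot pattern with different targets. A refactoring sketch, not in the original, that wraps the fit-and-plot steps in one helper:
In [ ]:
#Added helper sketch: fit a classifier and plot its relative feature importances
def plot_feature_importance(clf, X, y, title):
    clf.fit(X, y)
    imp = 100.0 * clf.feature_importances_ / clf.feature_importances_.max()
    order = np.argsort(imp)
    pos = np.arange(len(order)) + .5
    plt.figure(figsize=(7, 30))
    plt.barh(pos, imp[order], align='center')
    plt.yticks(pos, X.columns[order])
    plt.xlabel('Relative Importance')
    plt.title(title)
    plt.show()
    return imp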
MBD
In [684]:
ratings8.Section.unique()
Out[684]:
In [685]:
ratings11 = ratings8[ratings8.Section != 'BIS'].copy()
#One-vs-rest target: MBD = 1, all other programs = 0
ratings11['Section'] = ratings11['Section'].map({'MCC': 0, 'MBD': 1, 'MVDM': 0, 'MRCB': 0, 'MTDHR': 0, 'MCXI': 0, 'MCS': 0})
ratings11.Section.value_counts()
Out[685]:
In [686]:
#Upsample the minority class
# Separate majority and minority classes
ratings11_majority = ratings11[ratings11.Section==0]
ratings11_minority = ratings11[ratings11.Section==1]
# Upsample minority class to match the majority class size
ratings11_minority_upsampled = resample(ratings11_minority, replace=True, n_samples=len(ratings11_majority), random_state=123)
# Combine majority class with upsampled minority class
ratings11_upsampled = pd.concat([ratings11_majority, ratings11_minority_upsampled])
# Display new class counts
ratings11_upsampled.Section.value_counts()
Out[686]:
In [687]:
#Decision tree: which question gives us the most information?
#Convert the data so that we can run a tree
X3 = ratings11_upsampled.drop(['Section','Gender'],axis=1)
Y3 = ratings11_upsampled.Section.values
In [688]:
# params would configure a 500-round, depth-2 gradient boosting classifier, but
# that call is commented out; the model actually fitted is clf1, the depth-4
# decision tree defined earlier
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}
#clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X3, Y3)
feature_importance3 = clf1.feature_importances_
# Make importances relative to max importance.
feature_importance3 = 100.0 * (feature_importance3 / feature_importance3.max())
sorted_idx = np.argsort(feature_importance3)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(7, 30))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance3[sorted_idx], align='center')
plt.yticks(pos, X3.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('MBD')
plt.show()
MVDM
In [689]:
ratings8.Section.unique()
Out[689]:
In [690]:
ratings12 = ratings8[ratings8.Section != 'BIS'].copy()
#One-vs-rest target: MVDM = 1, all other programs = 0
ratings12['Section'] = ratings12['Section'].map({'MCC': 0, 'MBD': 0, 'MVDM': 1, 'MRCB': 0, 'MTDHR': 0, 'MCXI': 0, 'MCS': 0})
ratings12.Section.value_counts()
Out[690]:
In [691]:
#Upsample the minority class
# Separate majority and minority classes
ratings12_majority = ratings12[ratings12.Section==0]
ratings12_minority = ratings12[ratings12.Section==1]
# Upsample minority class to match the majority class size
ratings12_minority_upsampled = resample(ratings12_minority, replace=True, n_samples=len(ratings12_majority), random_state=123)
# Combine majority class with upsampled minority class
ratings12_upsampled = pd.concat([ratings12_majority, ratings12_minority_upsampled])
# Display new class counts
ratings12_upsampled.Section.value_counts()
Out[691]:
In [692]:
#Decision tree: which question gives us the most information?
#Convert the data so that we can run a tree
X4 = ratings12_upsampled.drop(['Section','Gender'],axis=1)
Y4 = ratings12_upsampled.Section.values
In [693]:
# params would configure a 500-round, depth-2 gradient boosting classifier, but
# that call is commented out; the model actually fitted is clf1, the depth-4
# decision tree defined earlier
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}
#clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X4, Y4)
feature_importance4 = clf1.feature_importances_
# Make importances relative to max importance.
feature_importance4 = 100.0 * (feature_importance4/ feature_importance4.max())
sorted_idx = np.argsort(feature_importance4)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(7, 30))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance4[sorted_idx], align='center')
plt.yticks(pos, X4.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('MVDM')
plt.show()
MRCB
In [694]:
ratings8.Section.unique()
Out[694]:
In [695]:
ratings13 = ratings8[ratings8.Section != 'BIS'].copy()
#One-vs-rest target: MRCB = 1, all other programs = 0
ratings13['Section'] = ratings13['Section'].map({'MCC': 0, 'MBD': 0, 'MVDM': 0, 'MRCB': 1, 'MTDHR': 0, 'MCXI': 0, 'MCS': 0})
ratings13.Section.value_counts()
Out[695]:
In [696]:
#Upsample the minority class
# Separate majority and minority classes
ratings13_majority = ratings13[ratings13.Section==0]
ratings13_minority = ratings13[ratings13.Section==1]
# Upsample minority class to match the majority class size
ratings13_minority_upsampled = resample(ratings13_minority, replace=True, n_samples=len(ratings13_majority), random_state=123)
# Combine majority class with upsampled minority class
ratings13_upsampled = pd.concat([ratings13_majority, ratings13_minority_upsampled])
# Display new class counts
ratings13_upsampled.Section.value_counts()
Out[696]:
In [697]:
#Decision tree: which question gives us the most information?
#Convert the data so that we can run a tree
X5 = ratings13_upsampled.drop(['Section','Gender'],axis=1)
Y5 = ratings13_upsampled.Section.values
In [698]:
# params would configure a 500-round, depth-2 gradient boosting classifier, but
# that call is commented out; the model actually fitted is clf1, the depth-4
# decision tree defined earlier
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}
#clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X5, Y5)
feature_importance5 = clf1.feature_importances_
# Make importances relative to max importance.
feature_importance5 = 100.0 * (feature_importance5 / feature_importance5.max())
sorted_idx = np.argsort(feature_importance5)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(7, 30))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance5[sorted_idx], align='center')
plt.yticks(pos, X5.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('MRCB')
plt.show()
MTDHR
In [699]:
ratings8.Section.unique()
Out[699]:
In [700]:
ratings14 = ratings8[ratings8.Section != 'BIS'].copy()
#One-vs-rest target: MTDHR = 1, all other programs = 0
ratings14['Section'] = ratings14['Section'].map({'MCC': 0, 'MBD': 0, 'MVDM': 0, 'MRCB': 0, 'MTDHR': 1, 'MCXI': 0, 'MCS': 0})
ratings14.Section.value_counts()
Out[700]:
In [701]:
#Upsample the minority class
# Separate majority and minority classes
ratings14_majority = ratings14[ratings14.Section==0]
ratings14_minority = ratings14[ratings14.Section==1]
# Upsample minority class to match the majority class size
ratings14_minority_upsampled = resample(ratings14_minority, replace=True, n_samples=len(ratings14_majority), random_state=123)
# Combine majority class with upsampled minority class
ratings14_upsampled = pd.concat([ratings14_majority, ratings14_minority_upsampled])
# Display new class counts
ratings14_upsampled.Section.value_counts()
Out[701]:
In [702]:
#Decision tree: which question gives us the most information?
#Convert the data so that we can run a tree
X6 = ratings14_upsampled.drop(['Section','Gender'],axis=1)
Y6 = ratings14_upsampled.Section.values
In [703]:
# params would configure a 500-round, depth-2 gradient boosting classifier, but
# that call is commented out; the model actually fitted is clf1, the depth-4
# decision tree defined earlier
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}
#clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X6, Y6)
feature_importance6 = clf1.feature_importances_
# Make importances relative to max importance.
feature_importance6 = 100.0 * (feature_importance6 / feature_importance6.max())
sorted_idx = np.argsort(feature_importance6)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(7, 30))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance6[sorted_idx], align='center')
plt.yticks(pos, X6.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('MTDHR')
plt.show()
MCXI
In [704]:
ratings8.Section.unique()
Out[704]:
In [705]:
ratings15 = ratings8[ratings8.Section != 'BIS'].copy()
#One-vs-rest target: MCXI = 1, all other programs = 0
ratings15['Section'] = ratings15['Section'].map({'MCC': 0, 'MBD': 0, 'MVDM': 0, 'MRCB': 0, 'MTDHR': 0, 'MCXI': 1, 'MCS': 0})
ratings15.Section.value_counts()
Out[705]:
In [706]:
#Upsample the minority class
# Separate majority and minority classes
ratings15_majority = ratings15[ratings15.Section==0]
ratings15_minority = ratings15[ratings15.Section==1]
# Upsample minority class to match the majority class size
ratings15_minority_upsampled = resample(ratings15_minority, replace=True, n_samples=len(ratings15_majority), random_state=123)
# Combine majority class with upsampled minority class
ratings15_upsampled = pd.concat([ratings15_majority, ratings15_minority_upsampled])
# Display new class counts
ratings15_upsampled.Section.value_counts()
Out[706]:
In [707]:
#Decision tree: which question gives us the most information?
#Convert the data so that we can run a tree
X7 = ratings15_upsampled.drop(['Section','Gender'],axis=1)
Y7 = ratings15_upsampled.Section.values
In [708]:
# params would configure a 500-round, depth-2 gradient boosting classifier, but
# that call is commented out; the model actually fitted is clf1, the depth-4
# decision tree defined earlier
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}
#clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X7, Y7)
feature_importance7 = clf1.feature_importances_
# Make importances relative to max importance.
feature_importance7 = 100.0 * (feature_importance7 / feature_importance7.max())
sorted_idx = np.argsort(feature_importance7)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(7, 30))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance7[sorted_idx], align='center')
plt.yticks(pos, X7.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('MCXI')
plt.show()
MCS
In [709]:
ratings8.Section.unique()
Out[709]:
In [710]:
ratings16 = ratings8[ratings8.Section != 'BIS'].copy()
#One-vs-rest target: MCS = 1, all other programs = 0
ratings16['Section'] = ratings16['Section'].map({'MCC': 0, 'MBD': 0, 'MVDM': 0, 'MRCB': 0, 'MTDHR': 0, 'MCXI': 0, 'MCS': 1})
ratings16.Section.value_counts()
Out[710]:
In [711]:
#Upsample the minority class
# Separate majority and minority classes
ratings16_majority = ratings16[ratings16.Section==0]
ratings16_minority = ratings16[ratings16.Section==1]
# Upsample minority class to match the majority class size
ratings16_minority_upsampled = resample(ratings16_minority, replace=True, n_samples=len(ratings16_majority), random_state=123)
# Combine majority class with upsampled minority class
ratings16_upsampled = pd.concat([ratings16_majority, ratings16_minority_upsampled])
# Display new class counts
ratings16_upsampled.Section.value_counts()
Out[711]:
In [712]:
#Decision tree: which question gives us the most information?
#Convert the data so that we can run a tree
X8 = ratings16_upsampled.drop(['Section','Gender'],axis=1)
Y8 = ratings16_upsampled.Section.values
In [713]:
# params would configure a 500-round, depth-2 gradient boosting classifier, but
# that call is commented out; the model actually fitted is clf1, the depth-4
# decision tree defined earlier
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}
#clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X8, Y8)
feature_importance8 = clf1.feature_importances_
# Make importances relative to max importance.
feature_importance8 = 100.0 * (feature_importance8 / feature_importance8.max())
sorted_idx = np.argsort(feature_importance8)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(7, 30))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance8[sorted_idx], align='center')
plt.yticks(pos, X8.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('MCS')
plt.show()
In [714]:
# View a list of the features and their importance scores
mcc = list(zip(X2.columns, feature_importance2))
mbd = list(zip(X3.columns, feature_importance3))
mvdm = list(zip(X4.columns, feature_importance4))
mrcb = list(zip(X5.columns, feature_importance5))
mtdhr = list(zip(X6.columns, feature_importance6))
mcxi = list(zip(X7.columns, feature_importance7))
mcs = list(zip(X8.columns, feature_importance8))
In [715]:
#Feature Importance for each program
names=['mcc','mbd', 'mvdm','mrcb','mtdhr','mcxi','mcs']
names2 = X2.columns
programfeatures = pd.DataFrame(list(zip(feature_importance2, feature_importance3, feature_importance4, feature_importance5,
feature_importance6, feature_importance7, feature_importance8)),columns = names, index=names2)
programfeatures = programfeatures.round(2)
Summary of relative importance of features to describe each program
In [716]:
#Summary: relative importance of the features for describing each program
programfeatures
Out[716]:
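A small usage example, not in the original: rank the questions for one program, e.g. the five most informative for MCC:
In [ ]:
#Added usage example: top five questions for MCC by relative importance
programfeatures.sort_values(by='mcc', ascending=False).head(5)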
Data prepared for modelling
In [717]:
ratings8.Section.unique()
Out[717]:
In [718]:
ratings17 = ratings8[ratings8.Section != 'BIS'].copy()
#Multiclass target over the seven remaining master's programs
ratings17['Section'] = ratings17['Section'].map({'MCC': 0, 'MBD': 1, 'MVDM': 2, 'MRCB': 3, 'MTDHR': 4, 'MCXI': 5, 'MCS': 6})
#X10/Y10 also feed the SelectKBest cell earlier in the notebook, so run this cell first
X10 = ratings17.drop(['Section','Gender'],axis=1)
Y10 = ratings17.Section.values
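With X10 and Y10 prepared, a minimal next-step sketch (assumed, not part of the original notebook): a cross-validated random forest over the seven master's programs. The hyperparameters here are illustrative, not tuned choices.
In [ ]:
#Added sketch: baseline multiclass model on the prepared data
#n_estimators and cv are assumptions, not values from the original notebook
rfc = RandomForestClassifier(n_estimators=100, random_state=123)
print(cross_val_score(rfc, X10, Y10, cv=5))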
In [ ]: